library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.1     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(openintro) # a bunch of data sets
## Please visit openintro.org for free statistics materials
## 
## Attaching package: 'openintro'
## The following object is masked from 'package:ggplot2':
## 
##     diamonds
## The following objects are masked from 'package:datasets':
## 
##     cars, trees
library(broom)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Multiple and Logistic Regression

These techniques allow you to model and predict numeric and categorical variables using multiple input variables. Learn how to fit, visualize, and interpret the models.

What if you have two groups?

Extend simple linear regression to a number of variables, which can be both numeric and categorical. Logistic regression allows us to model a binary response.

Car example: we want to look at fuel efficiency over time while controlling for engine size, so we use a parallel slopes model.

We will fit a parallel slopes model using lm(). In addition to the data argument, lm() needs to know which variables you want to include in your regression model, and how you want to include them. It accomplishes this using a formula argument. A simple linear regression formula looks like y ~ x, where y is the name of the response variable, and x is the name of the explanatory variable. Here, we will simply extend this formula to include multiple explanatory variables. A parallel slopes model has the form y ~ x + z, where z is a categorical explanatory variable, and x is a numerical explanatory variable.

# multiple regression: inspect mtcars, then fit a parallel slopes model on
# the mpg data (hwy ~ displ, with model year as a categorical term)
glimpse(mtcars)
## Observations: 32
## Variables: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2,…
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140…
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 18…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92,…
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.1…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.…
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1,…
mod <- lm(hwy ~ displ + factor(year), data = mpg) # factor() makes R treat year (stored as a number) as a categorical variable

# augment() (broom) returns the data used in the fit plus per-observation
# columns such as .fitted (predicted value) and .resid
mod_aug <- augment(mod)

# Plot lines with geom_line for predicted values.
# NOTE(review): the broom version used here sanitizes the column name
# `factor(year)` to `factor.year.` — confirm this still holds if broom is
# updated (newer versions keep the backticked name `factor(year)`).
ggplot(data = mod_aug, mapping = aes(y = hwy, x = displ, color = factor.year.)) +
  geom_point(alpha = 0.3) +
  geom_line(aes(y = .fitted), lwd = 1)

Parallel slopes models are so-named because we can visualize these models in the data space as not one line, but two parallel lines.

Interpreting parallel slopes coefficients

Most often our goal is to interpret the coefficients. What does the model tell us about the relationship between engine size and fuel economy in the context of the year the cars were manufactured?

Coefficients:
(Intercept) displ factor(year)2008
35.276 -3.611 1.402

35.28 is the expected fuel economy of a car from 1999 with an engine size of 0 liters. Of course, there is no car with an engine of 0 liters, so this is just the interpretation of the intercept. The model tells us that for every additional liter in engine size there is a decrease in fuel economy of 3.611 mpg. 1.4 is the average difference in mpg between 1999 and 2008 for the same type of car. Other notes:

  • There is only one slope and it relates to the numeric explanatory variable
  • Pay attention to the reference level of the categorical variable
  • Units are important - every coefficient has units that relate to the response variable
  • The "after controlling for" phrasing is crucial to a valid understanding
# load the eBay MarioKart auction data (openintro package) and inspect it
data(marioKart)
glimpse(marioKart)
## Observations: 143
## Variables: 12
## $ ID         <dbl> 150377422259, 260483376854, 320432342985, 28040522467…
## $ duration   <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3,…
## $ nBids      <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16…
## $ cond       <fct> new, used, new, new, new, new, used, new, used, used,…
## $ startPr    <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99,…
## $ shipPr     <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00,…
## $ totalPr    <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.9…
## $ shipSp     <fct> standard, firstClass, firstClass, standard, media, st…
## $ sellerRate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, …
## $ stockPhoto <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, …
## $ wheels     <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1,…
## $ title      <fct> "~~ Wii MARIO KART &amp; WHEEL ~ NINTENDO Wii ~ BRAND…
# fit parallel slopes: one slope for wheels, separate intercepts for new/used
mod_kart <- lm(totalPr ~ wheels + cond, data = marioKart)

mod_kart_aug <- augment(mod_kart) # %>% filter(totalPr < 100)
glimpse(mod_kart_aug)
## Observations: 143
## Variables: 10
## $ totalPr    <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.9…
## $ wheels     <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1,…
## $ cond       <fct> new, used, new, new, new, new, used, new, used, used,…
## $ .fitted    <dbl> 47.88342, 48.72916, 47.88342, 47.88342, 58.09954, 37.…
## $ .se.fit    <dbl> 3.532896, 2.696310, 3.532896, 3.532896, 3.375000, 5.2…
## $ .resid     <dbl> 3.666579, -11.689162, -2.383421, -3.883421, 12.900457…
## $ .hat       <dbl> 0.02093127, 0.01219196, 0.02093127, 0.02093127, 0.019…
## $ .sigma     <dbl> 24.504954, 24.486658, 24.506118, 24.504709, 24.482054…
## $ .cooksd    <dbl> 1.640983e-04, 9.543509e-04, 6.933998e-05, 1.840819e-0…
## $ .std.resid <dbl> 0.15174745, -0.48163062, -0.09864186, -0.16072186, 0.…
# Plot lines with geom_line for predicted values.
# ylim() drops the observations with totalPr outside [25, 80], which is why
# ggplot warns about removed rows below.
ggplot(data = mod_kart_aug, mapping = aes(y = totalPr, x = wheels, color = cond)) +
  geom_point(alpha = 0.3) +
  geom_line(aes(y = .fitted), lwd = 1) +
  ylim(25,80)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

Interpretation of slope coef - For each additional wheel, the expected price of a MarioKart increases by about $10.22 in this fit, regardless of whether it is new or used. (The course quotes $7.23, presumably from the model refit after removing the outlier auctions with totalPr above 100 — see the commented-out filter above.)

# load the babies data (openintro package) and inspect it
data(babies)
glimpse(babies)
## Observations: 1,236
## Variables: 8
## $ case      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ bwt       <int> 120, 113, 128, 123, 108, 136, 138, 132, 120, 143, 140,…
## $ gestation <int> 284, 282, 279, NA, 282, 286, 244, 245, 289, 299, 351, …
## $ parity    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ age       <int> 27, 33, 28, 36, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36…
## $ height    <int> 62, 64, 64, 69, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61…
## $ weight    <int> 100, 135, 115, 190, 125, 93, 178, 140, 125, 136, 120, …
## $ smoke     <int> 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, …
# visually examine the data (parity is 0/1: first child or not)
ggplot(data = babies, mapping = aes(x = age, y = bwt, color = parity)) + geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).

# build a model for birthweight as a function of mother's age and whether or
# not it is her first child
lm(bwt ~ age + parity, data = babies)
## 
## Call:
## lm(formula = bwt ~ age + parity, data = babies)
## 
## Coefficients:
## (Intercept)          age       parity  
##   118.27782      0.06315     -1.65248
# visually examine the data, coloring by smoking status (factor() so the
# 0/1 smoke column is treated as discrete)
ggplot(data = babies, mapping = aes(x = gestation, y = bwt, color = factor(smoke))) + geom_point(alpha = .6)
## Warning: Removed 13 rows containing missing values (geom_point).

#  build a model for birthweight as a function of the length of gestation and the mother's smoking status
lm(bwt ~ gestation + smoke, data = babies)
## 
## Call:
## lm(formula = bwt ~ gestation + smoke, data = babies)
## 
## Coefficients:
## (Intercept)    gestation        smoke  
##     -0.9317       0.4429      -8.0883

Model Fit, Residuals, and Prediction

Residuals are the differences between the actual value and the predicted(fitted) value from model (line of best fit)
Model fitting procedure minimizes the residuals over the entire data space.
Coef of determination (R^2) is the same as simple linear regression. It measures the percentage of the variability in the response variable that is explained by the model. To compute this, we define R2=1−SSE/SST

The adjusted R^2 is used to assess model performance for multiple linear regression. The only difference is that a penalty is applied as the number of explanatory variables increases.
$R^2_{adj} = 1 - \frac{SSE}{SST} \cdot \frac{n-1}{n-p-1}$

We can see both measures in the output of the summary() function on our model object.

augment(mod) returns a data frame with the original values along with other calculations from the model, including the fitted values and residuals. This function comes from the broom package and is used in the tidyverse.
Making out of sample predictions - you can use augment(mod, newdata)

# See how the r^2 values change just by adding an extra variable called noise to the marioKart dataset

# R^2 and adjusted R^2
summary(mod_kart)
## 
## Call:
## lm(formula = totalPr ~ wheels + cond, data = marioKart)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.719  -6.462  -2.513   0.487 267.565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.6673     5.2795   7.135 4.78e-11 ***
## wheels       10.2161     2.6740   3.821   0.0002 ***
## condused      0.8457     4.5855   0.184   0.8539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.42 on 140 degrees of freedom
## Multiple R-squared:  0.1091, Adjusted R-squared:  0.09638 
## F-statistic: 8.573 on 2 and 140 DF,  p-value: 0.0003075
# add random noise
# NOTE(review): rnorm() is not seeded, so this chunk (and the summary below)
# is not reproducible — consider set.seed() before mutate()
mario_kart_noisy <- marioKart %>%
  mutate(noise = rnorm(nrow(marioKart)))
  
# compute new model: same as mod_kart plus the pure-noise predictor.
# Expect multiple R^2 to rise slightly while adjusted R^2 falls.
mod_kart_2 <- lm(totalPr ~ wheels + cond + noise, data = mario_kart_noisy)

# new R^2 and adjusted R^2
summary(mod_kart_2)
## 
## Call:
## lm(formula = totalPr ~ wheels + cond + noise, data = mario_kart_noisy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.383  -6.528  -2.843   0.476 267.855 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.1682     5.3096   7.189 3.67e-11 ***
## wheels       10.0273     2.6831   3.737 0.000271 ***
## condused      0.3991     4.6130   0.087 0.931187    
## noise        -1.9499     2.1011  -0.928 0.354977    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.43 on 139 degrees of freedom
## Multiple R-squared:  0.1146, Adjusted R-squared:  0.09548 
## F-statistic: 5.997 on 3 and 139 DF,  p-value: 0.0007144
# Make predictions — these are in-sample fitted values for the training data;
# for out-of-sample predictions use augment(mod, newdata = ...)
augment(mod_kart_2)
## # A tibble: 143 x 11
##    totalPr wheels cond    noise .fitted .se.fit .resid   .hat .sigma
##      <dbl>  <int> <fct>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1    51.6      1 new   -1.31      50.8    4.70  0.793 0.0370   24.5
##  2    37.0      1 used   1.15      46.4    3.72 -9.32  0.0231   24.5
##  3    45.5      1 new    0.923     46.4    3.88 -0.896 0.0252   24.5
##  4    44        1 new    0.280     47.6    3.54 -3.65  0.0210   24.5
##  5    71        2 new    0.702     56.9    3.63 14.1   0.0221   24.5
##  6    45        0 new   -0.139     38.4    5.35  6.56  0.0479   24.5
##  7    37.0      0 used   0.241     38.1    3.52 -1.08  0.0208   24.5
##  8    54.0      2 new    0.712     56.8    3.64 -2.84  0.0222   24.5
##  9    47        1 used   0.0808    48.4    2.72 -1.44  0.0124   24.5
## 10    50        1 used  -1.07      50.7    3.43 -0.688 0.0197   24.5
## # … with 133 more rows, and 2 more variables: .cooksd <dbl>,
## #   .std.resid <dbl>

Fitting a model with interaction

Including an interaction term in a model is easy—we just have to tell lm() that we want to include that new variable. The use of the colon (:) here means that the interaction between x and z will be a third term in the model.

# interaction term: the colon (x:z) adds a product term, which lets the
# slope of duration differ between new and used items
# lm(y ~ x + z + x:z, data = mydata)
# include interaction
lm(totalPr ~ cond + duration + cond:duration, data = marioKart)
## 
## Call:
## lm(formula = totalPr ~ cond + duration + cond:duration, data = marioKart)
## 
## Coefficients:
##       (Intercept)           condused           duration  
##            58.268            -17.564             -1.966  
## condused:duration  
##             3.305

Visualizing Interaction Models

Interaction allows the slope of the regression line in each group to vary. In this case, this means that the relationship between the final price and the length of the auction is moderated by the condition of each item.

Interaction models are easy to visualize in the data space with ggplot2 because they have the same coefficients as if the models were fit independently to each group defined by the level of the categorical variable. In this case, new and used MarioKarts each get their own regression line. To see this, we can set an aesthetic (e.g. color) to the categorical variable, and then add a geom_smooth() layer to overlay the regression line for each color.

# interaction plot: because color = cond splits the data into groups,
# geom_smooth() fits a separate regression line for new and used items —
# visually equivalent to the interaction model above.
# ylim() drops points outside [30, 70], hence the warnings.
ggplot(marioKart, aes(y = totalPr, x = duration, color = cond)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  ylim(30,70)
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# simple linear regression plot: one line fit to all auctions, ignoring
# condition — compare with the grouped plot above to see Simpson's paradox
ggplot(marioKart, aes(y = totalPr, x = duration)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +  # se = FALSE (not 0): the argument is logical
  ylim(30,70)
## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 5 rows containing missing values (geom_point).

# overall (ungrouped) model: duration appears to LOWER the price
lm(totalPr ~ duration, data = marioKart)
## 
## Call:
## lm(formula = totalPr ~ duration, data = marioKart)
## 
## Coefficients:
## (Intercept)     duration  
##     51.4246      -0.4097
# This is an example of Simpson's paradox. When you look at both used and new
# combined, the total price goes down with duration. But looking at them
# separately shows what is really happening.

Simpson’s Paradox

“For every additional thousand dollars of salary for teachers in a particular state, the expected SAT score for a student from that state is about 2 points higher, after controlling for the percentage of students taking the SAT.”

Multiple Linear Regression

# Fit the model using duration and startPr - two numeric variables, instead of a categorical
lm(totalPr ~ duration + startPr, data = marioKart)
## 
## Call:
## lm(formula = totalPr ~ duration + startPr, data = marioKart)
## 
## Coefficients:
## (Intercept)     duration      startPr  
##     50.6269      -0.5173       0.1371
# visualizing MLR in 3D: with two numeric explanatory variables the fitted
# model is a plane in (duration, startPr, totalPr) space
# draw the 3D scatterplot (plotly formulas use the ~ prefix to reference columns)
p <- plot_ly(data = marioKart, z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
  add_markers() 
p